In [1]:
# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

In [2]:
# get rossmann, store, & test csv files as a DataFrame
rossmann_df  = pd.read_csv("train.csv")
store_df     = pd.read_csv("store.csv")
test_df      = pd.read_csv("test.csv")

# preview the data
rossmann_df.head()


C:\Users\Administrator\Anaconda2\envs\gl-env\lib\site-packages\IPython\core\interactiveshell.py:2723: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[2]:
Store DayOfWeek Date Sales Customers Open Promo StateHoliday SchoolHoliday
0 1 5 2015-07-31 5263 555 1 1 0 1
1 2 5 2015-07-31 6064 625 1 1 0 1
2 3 5 2015-07-31 8314 821 1 1 0 1
3 4 5 2015-07-31 13995 1498 1 1 0 1
4 5 5 2015-07-31 4822 559 1 1 0 1

In [4]:
# Open
fig, (axis1) = plt.subplots(1,1,figsize=(15,4))
sns.countplot(x='Open',hue='DayOfWeek', data=rossmann_df,palette="husl", ax=axis1)

#默认周天以外的店铺状态都为“open”
test_df["Open"][test_df["Open"] != test_df["Open"]] = (test_df["DayOfWeek"] != 7).astype(int)

# Drop Open column
# rossmann_df.drop("Open", axis=1, inplace=True)
# test_df.drop("Open", axis=1, inplace=True)


C:\Users\Administrator\Anaconda2\envs\gl-env\lib\site-packages\ipykernel\__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [5]:
# Date


rossmann_df['Year']  = rossmann_df['Date'].apply(lambda x: int(str(x)[:4]))
rossmann_df['Month'] = rossmann_df['Date'].apply(lambda x: int(str(x)[5:7]))

test_df['Year']  = test_df['Date'].apply(lambda x: int(str(x)[:4]))
test_df['Month'] = test_df['Date'].apply(lambda x: int(str(x)[5:7]))

# Assign Date column to Date(Year-Month) instead of (Year-Month-Day)
#将Date列添加给Date(Year-Month) 取代 (Year-Month-Day)


rossmann_df['Date'] = rossmann_df['Date'].apply(lambda x: (str(x)[:7]))
test_df['Date']     = test_df['Date'].apply(lambda x: (str(x)[:7]))


average_sales    = rossmann_df.groupby('Date')["Sales"].mean()
pct_change_sales = rossmann_df.groupby('Date')["Sales"].sum().pct_change()

fig, (axis1,axis2) = plt.subplots(2,1,sharex=True,figsize=(15,8))

# plot average sales over time(year-month)
ax1 = average_sales.plot(legend=True,ax=axis1,marker='o',title="Average Sales")
ax1.set_xticks(range(len(average_sales)))
ax1.set_xticklabels(average_sales.index.tolist(), rotation=90)

# plot precent change for sales over time(year-month)
ax2 = pct_change_sales.plot(legend=True,ax=axis2,marker='o',rot=90,colormap="summer",title="Sales Percent Change")
# ax2.set_xticks(range(len(pct_change_sales)))
# ax2.set_xticklabels(pct_change_sales.index.tolist(), rotation=90)



In [6]:
#比较年份的销售和每年到访的用户数量

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='Year', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='Year', y='Customers', data=rossmann_df, ax=axis2)

# Drop Date column
# rossmann_df.drop(['Date'], axis=1,inplace=True)
# test_df.drop(['Date'], axis=1,inplace=True)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x1600a048>

In [7]:
# Customers

fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8))

# Plot max, min values, & 2nd, 3rd quartile
sns.boxplot([rossmann_df["Customers"]], whis=np.inf, ax=axis1)

# group by date and get average customers, and precent change
average_customers      = rossmann_df.groupby('Date')["Customers"].mean()
# pct_change_customers = rossmann_df.groupby('Date')["Customers"].sum().pct_change()

# Plot average customers over the time
# it should be correlated with the average sales over time
ax = average_customers.plot(legend=True,marker='o', ax=axis2)
ax.set_xticks(range(len(average_customers)))
xlabels = ax.set_xticklabels(average_customers.index.tolist(), rotation=90)



In [8]:
# DayOfWeek
# In both cases where the store is closed and opened

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='DayOfWeek', y='Sales', data=rossmann_df, order=[1,2,3,4,5,6,7], ax=axis1)
sns.barplot(x='DayOfWeek', y='Customers', data=rossmann_df, order=[1,2,3,4,5,6,7], ax=axis2)


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b557f98>

In [9]:
# Promo

# Plot average sales & customers with/without promo
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='Promo', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='Promo', y='Customers', data=rossmann_df, ax=axis2)

#可以看出是否进行促销对销售额与顾客量有明显的影响


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1cb42940>

In [10]:
# StateHoliday

# StateHoliday column has values 0 & "0", So, we need to merge values with 0 to "0"
rossmann_df["StateHoliday"].loc[rossmann_df["StateHoliday"] == 0] = "0"
# test_df["StateHoliday"].loc[test_df["StateHoliday"] == 0] = "0"

# Plot
sns.countplot(x='StateHoliday', data=rossmann_df)

# Before
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1)

mask = (rossmann_df["StateHoliday"] != "0") & (rossmann_df["Sales"] > 0)
sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df[mask], ax=axis2)


C:\Users\Administrator\Anaconda2\envs\gl-env\lib\site-packages\pandas\core\indexing.py:128: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d52c9e8>

In [11]:
# .... continue with StateHoliday

rossmann_df["StateHoliday"] = rossmann_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
test_df["StateHoliday"]     = test_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='StateHoliday', y='Customers', data=rossmann_df, ax=axis2)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0xe02b7b8>

In [12]:
# SchoolHoliday

# Plot
sns.countplot(x='SchoolHoliday', data=rossmann_df)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='SchoolHoliday', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='SchoolHoliday', y='Customers', data=rossmann_df, ax=axis2)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1db2c390>

In [13]:
# Sales

fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8))

# Plot max, min values, & 2nd, 3rd quartile
sns.boxplot([rossmann_df["Customers"]], whis=np.inf, ax=axis1)

# Plot sales values 
# Notice that values with 0 is mostly because the store was closed
rossmann_df["Sales"].plot(kind='hist',bins=70,xlim=(0,15000),ax=axis2)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b96d748>

In [14]:
average_sales_customers = rossmann_df.groupby('Store')[["Sales", "Customers"]].mean()
average_sales_customers


Out[14]:
Sales Customers
Store
1 3945.704883 467.646497
2 4122.991507 486.045648
3 5741.253715 620.286624
4 8021.769639 1100.057325
5 3867.110403 444.360934
6 4562.375796 525.990446
7 7356.902335 791.474522
8 4610.251592 547.799363
9 5426.816348 479.487261
10 4634.439490 494.332272
11 6683.955414 940.543524
12 6316.608280 746.508493
13 4124.773087 313.641161
14 4555.386412 522.200637
15 5553.619958 552.130573
16 6335.340764 703.158174
17 5259.541401 657.208068
18 5446.005308 592.128450
19 5346.251592 516.579618
20 6334.489446 692.952507
21 4522.041401 425.916136
22 3695.918206 411.779683
23 4565.073248 398.957537
24 7763.773885 690.870488
25 8892.311040 1250.288747
26 5538.498938 461.623142
27 7838.438429 888.786624
28 4332.218684 475.791932
29 6090.403397 540.587049
30 4367.352442 641.006369
... ... ...
1086 6365.240977 695.715499
1087 5228.500000 512.247346
1088 4204.866242 401.199575
1089 8402.582803 829.093418
1090 5343.307856 650.546709
1091 6888.168790 580.252654
1092 9814.584433 955.994723
1093 6904.761146 719.926752
1094 3753.922164 306.767810
1095 3969.285563 562.701699
1096 4512.348195 523.520170
1097 9744.599788 2420.921444
1098 4354.997877 587.133758
1099 7340.338641 928.975584
1100 4423.963907 546.181529
1101 8299.635881 822.659236
1102 5744.054090 579.721636
1103 4161.639066 319.749469
1104 4481.546174 353.845646
1105 3774.736730 421.781316
1106 4073.217622 464.085987
1107 5121.849604 549.465699
1108 4758.597665 488.755839
1109 4043.167546 371.179420
1110 3766.970276 449.000000
1111 4342.968153 373.548832
1112 8465.280255 693.498938
1113 5516.180467 596.763270
1114 17200.196391 2664.057325
1115 5225.296178 358.687898

1115 rows × 2 columns


In [15]:
# Using store_df

# 每一家平均的销售和顾客合并进store_data里
average_sales_customers = rossmann_df.groupby('Store')[["Sales", "Customers"]].mean()
sales_customers_df = DataFrame({'Store':average_sales_customers.index,
                      'Sales':average_sales_customers["Sales"], 'Customers': average_sales_customers["Customers"]}, 
                      columns=['Store', 'Sales', 'Customers'])
store_df = pd.merge(sales_customers_df, store_df, on='Store')

store_df.head()


Out[15]:
Store Sales Customers StoreType Assortment CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear PromoInterval
0 1 3945.704883 467.646497 c a 1270.0 9.0 2008.0 0 NaN NaN NaN
1 2 4122.991507 486.045648 a a 570.0 11.0 2007.0 1 13.0 2010.0 Jan,Apr,Jul,Oct
2 3 5741.253715 620.286624 a a 14130.0 12.0 2006.0 1 14.0 2011.0 Jan,Apr,Jul,Oct
3 4 8021.769639 1100.057325 c c 620.0 9.0 2009.0 0 NaN NaN NaN
4 5 3867.110403 444.360934 a a 29910.0 4.0 2015.0 0 NaN NaN NaN

In [16]:
# StoreType 

# Plot StoreType, & StoreType Vs average sales and customers

sns.countplot(x='StoreType', data=store_df, order=['a','b','c', 'd'])

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='StoreType', y='Sales', data=store_df, order=['a','b','c', 'd'],ax=axis1)
sns.barplot(x='StoreType', y='Customers', data=store_df, order=['a','b','c', 'd'], ax=axis2)


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x11457710>

In [17]:
# Assortment 

# Plot Assortment, & Assortment Vs average sales and customers

sns.countplot(x='Assortment', data=store_df, order=['a','b','c'])

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='Assortment', y='Sales', data=store_df, order=['a','b','c'], ax=axis1)
sns.barplot(x='Assortment', y='Customers', data=store_df, order=['a','b','c'], ax=axis2)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x10675f60>

In [18]:
# Promo2

# Plot Promo2, & Promo2 Vs average sales and customers

sns.countplot(x='Promo2', data=store_df)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='Promo2', y='Sales', data=store_df, ax=axis1)
sns.barplot(x='Promo2', y='Customers', data=store_df, ax=axis2)


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0xf4e5320>

In [19]:
# CompetitionDistance

# fill NaN values
store_df["CompetitionDistance"].fillna(store_df["CompetitionDistance"].median())

# Plot CompetitionDistance Vs Sales
store_df.plot(kind='scatter',x='CompetitionDistance',y='Sales',figsize=(15,4))
store_df.plot(kind='kde',x='CompetitionDistance',y='Sales',figsize=(15,4))


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x10571898>

In [20]:
# What happened to the average sales of a store over time when competition started?
# Example: the average sales for store_id = 6 has dramatically decreased since the competition started

store_id = 6
store_data = rossmann_df[rossmann_df["Store"] == store_id]

average_store_sales = store_data.groupby('Date')["Sales"].mean()

# Get year, and month when Competition started
y = store_df["CompetitionOpenSinceYear"].loc[store_df["Store"]  == store_id].values[0]
m = store_df["CompetitionOpenSinceMonth"].loc[store_df["Store"] == store_id].values[0]

# Plot 
ax = average_store_sales.plot(legend=True,figsize=(15,4),marker='o')
ax.set_xticks(range(len(average_store_sales)))
ax.set_xticklabels(average_store_sales.index.tolist(), rotation=90)

# Since all data of store sales given in rossmann_df starts with year=2013 till 2015,
# So, we need to check if year>=2013 and y & m aren't NaN values.
if y >= 2013 and y == y and m == m:
    plt.axvline(x=((y-2013) * 12) + (m - 1), linewidth=3, color='grey')



In [21]:
# Risk Analysis
# Analyze the risk of a store; Risk(std) Vs Expected(mean)

# .... countiue using store_data
store_average = store_data["Sales"].mean()
store_std     = store_data["Sales"].std()

# Plot
plt.scatter(store_average, store_std,alpha = 0.5,s =np.pi*20)

# Get min & max mean and std of store sales
# Remember that store_df["Sales"] has the average sales for a store
std_sales = rossmann_df.groupby('Store')["Sales"].std()

min_average = store_df["Sales"].min()
max_average = store_df["Sales"].max()
min_std     = std_sales.min()
max_std     = std_sales.max()

# Set the x and y limits of the plot
plt.ylim([min_std, max_std])
plt.xlim([min_average, max_average])

# Set the plot axis titles
plt.xlabel('Expected Sales')
plt.ylabel('Risk')

# Set label
label, x, y = "Store {}".format(store_id), store_average, store_std
plt.annotate(
        label, 
        xy = (x, y), xytext = (50, 50),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        arrowprops = dict(arrowstyle = '-', connectionstyle = 'arc3,rad=-0.3'))


Out[21]:
<matplotlib.text.Annotation at 0x104eefd0>

In [22]:
# Correlation
# Visualize the Correlation between stores

store_piv       = pd.pivot_table(rossmann_df,values='Sales', index='Date', columns=['Store'],aggfunc='sum')
store_pct_chage = store_piv.pct_change().dropna()
store_piv.head()


Out[22]:
Store 1 2 3 4 5 6 7 8 9 10 ... 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115
Date
2013-01 128431.0 115171.0 165653.0 234713.0 109442.0 152265.0 188032.0 122293.0 127016.0 130042.0 ... 115116.0 143763.0 143571.0 119190.0 106800.0 128626.0 304489.0 155879.0 456530.0 116482.0
2013-02 125271.0 111114.0 160167.0 230473.0 101126.0 139655.0 178830.0 119828.0 126826.0 122956.0 ... 107923.0 140576.0 127544.0 115328.0 99489.0 122822.0 301544.0 147817.0 446745.0 123114.0
2013-03 145169.0 130525.0 189179.0 255514.0 116860.0 162550.0 213082.0 135408.0 143583.0 133609.0 ... 124035.0 170060.0 148065.0 124393.0 116539.0 138298.0 312362.0 171412.0 516987.0 155033.0
2013-04 118009.0 116878.0 172188.0 226981.0 106141.0 154269.0 197381.0 127236.0 136194.0 131040.0 ... 110478.0 154915.0 132135.0 118303.0 106745.0 131652.0 284870.0 148796.0 497336.0 126212.0
2013-05 118225.0 116379.0 170374.0 235284.0 110292.0 157972.0 210921.0 126886.0 136540.0 130642.0 ... 111092.0 151585.0 131119.0 117329.0 105277.0 128784.0 284762.0 155649.0 503359.0 131329.0

5 rows × 1115 columns


In [23]:
# .... continue Correlation
# Plot correlation between range of stores
start_store = 1
end_store   = 5

fig, (axis1) = plt.subplots(1,1,figsize=(15,5))

# using summation of sales values for each store 
sns.heatmap(store_piv[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2)

# using percent change for each store
# sns.heatmap(store_pct_chage[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x10687780>

In [24]:
# Notice that test_df has only year=2015, and months 8 & 9

# drop Year and Month
rossmann_df.drop(["Year", "Month"], axis=1, inplace=True)
test_df.drop(["Year", "Month"], axis=1, inplace=True)

# Create dummy varibales for DayOfWeek
day_dummies_rossmann  = pd.get_dummies(rossmann_df['DayOfWeek'], prefix='Day')
day_dummies_rossmann.drop(['Day_7'], axis=1, inplace=True)

day_dummies_test  = pd.get_dummies(test_df['DayOfWeek'],prefix='Day')
day_dummies_test.drop(['Day_7'], axis=1, inplace=True)

rossmann_df = rossmann_df.join(day_dummies_rossmann)
test_df     = test_df.join(day_dummies_test)

rossmann_df.drop(['DayOfWeek'], axis=1,inplace=True)
test_df.drop(['DayOfWeek'], axis=1,inplace=True)

In [25]:
# remove all rows(store,date) that were closed
rossmann_df = rossmann_df[rossmann_df["Open"] != 0]

# drop unnecessary columns, these columns won't be useful in prediction
rossmann_df.drop(["Open","Customers", "Date"], axis=1, inplace=True)

In [26]:
# save ids of closed stores, because we will assign their sales value to 0 later(see below)
closed_store_ids = test_df["Id"][test_df["Open"] == 0].values

# remove all rows(store,date) that were closed
test_df = test_df[test_df["Open"] != 0]

# drop unnecessary columns, these columns won't be useful in prediction
test_df.drop(['Open', 'Date'], axis=1,inplace=True)

In [28]:
# Loop through each store, 
# train the model using the data of current store, and predict it's sales values.

rossmann_dic = dict(list(rossmann_df.groupby('Store')))
test_dic     = dict(list(test_df.groupby('Store')))
submission   = Series()
scores       = []

for i in test_dic:
    
    # current store
    store = rossmann_dic[i]
    
    # define training and testing sets
    X_train = store.drop(["Sales","Store"],axis=1)
    Y_train = store["Sales"]
    X_test  = test_dic[i].copy()
    
    store_ids = X_test["Id"]
    X_test.drop(["Id","Store"], axis=1,inplace=True)
    
    # Linear Regression
    lreg = LinearRegression()
    lreg.fit(X_train, Y_train)
    Y_pred = lreg.predict(X_test)
    scores.append(lreg.score(X_train, Y_train))

    # Xgboost
    params = {"objective": "reg:linear",  "max_depth": 10}
    T_train_xgb = xgb.DMatrix(X_train, Y_train)
    X_test_xgb  = xgb.DMatrix(X_test)
    gbm = xgb.train(params, T_train_xgb, 100)
    Y_pred = gbm.predict(X_test_xgb)
    
    # append predicted values of current store to submission
    submission = submission.append(Series(Y_pred, index=store_ids))

# append rows(store,date) that were closed, and assign their sales value to 0
submission = submission.append(Series(0, index=closed_store_ids))

# save to csv file
submission = pd.DataFrame({ "Id": submission.index, "Sales": submission.values})
submission.to_csv('rossmann.csv', index=False)

In [ ]: